import numpy as np
import pandas as pd
from sklearn import metrics
import matplotlib.pyplot as plt
%matplotlib inline
# Load the regression dataset from the local Excel workbook.
# BUG FIX: pd.read_excel does not accept a `delimiter` argument (that belongs
# to read_csv); modern pandas raises TypeError for it, so it is removed.
df = pd.read_excel("C:\\Users\\vallabh\\Vallabha Datta\\Project\\df_reg.xlsx")
df.head(5)
# Drop the leftover index column produced by the Excel export.
df = df.drop('Unnamed: 0', axis=1)
df.head(5)
# Checking for data types and null values
df.info()
From the info above, the "State" column has 8 null values.
# Mean of the numeric state codes, truncated to an integer so it can be
# used as a fill value for the missing entries.
state_mean = int(df["State"].mean())
state_mean
Replacing the null values with the mean. The mean state code is 27, which corresponds to Montana, so the missing states are filled with Montana's code.
# Fill missing state codes with the truncated mean.
# BUG FIX: `df.State.fillna(..., inplace=True)` mutates through an attribute
# access (chained assignment) and may silently operate on a copy under
# pandas copy-on-write; assigning the result back is the reliable form.
df['State'] = df['State'].fillna(state_mean)
df.info()
df.shape
df['Unemployment_rate_2018'].describe()
# Convert Unemployment_rate_2018 from a number into one of five ordinal
# categories so it can serve as a classification target for kNN.
def _rate_category(rate):
    # Band boundaries: (-inf,3.1] / (3.1,3.9] / (3.9,4.9] / (4.9,10] / (10,inf)
    if rate <= 3.1:
        return "Very low"
    if rate <= 3.9:
        return "Low"
    if rate <= 4.9:
        return "Average"
    if rate <= 10:
        return "High"
    return "Very High"

lst = [_rate_category(rate) for rate in df.Unemployment_rate_2018]
lst
df.Unemployment_rate_2018 = lst
df['Unemployment_rate_2018'].head(5)
# The binned unemployment rate becomes the class target; remove it from
# the feature matrix so it cannot leak into the predictors.
df_target = df['Unemployment_rate_2018']
df = df.drop(columns=['Unemployment_rate_2018'])
from sklearn import preprocessing
# Scale every feature into [0, 1] with min-max scaling.
# BUG FIX: the original called `fit(df)` and then `fit_transform(df)`,
# fitting the scaler twice; a single fit_transform is sufficient.
min_max_scaler = preprocessing.MinMaxScaler()
df_scaled = min_max_scaler.fit_transform(df)
df_scaled
# Wrap the scaled array back into a DataFrame with the original column names.
df_scaled = pd.DataFrame(df_scaled, columns=df.columns)
df_scaled.head(5)
np.set_printoptions(precision=2, linewidth=100)
# Splitting data into training and testing
from sklearn.model_selection import train_test_split
# 80/20 split; the fixed random_state makes this particular split reproducible.
df_train, df_test, df_target_train, df_target_test = train_test_split(df_scaled, df_target, test_size=0.2, random_state=478)
df_target_train.head(5)
df_target_test.head(5)
df_train.head(5)
df_test.head(5)
print("Shape of Train data: ", df_train.shape)
print("Shape of Test data: ", df_test.shape)
# NumPy copies for the hand-rolled kNN below, which expects plain arrays.
df_train_arr = np.array(df_train)
df_test_arr = np.array(df_test)
df_target_train_arr = np.array(df_target_train)
df_target_test_arr = np.array(df_target_test)
def knn_search(x, D, K, measure):
    """Find the K nearest neighbors of instance ``x`` among the rows of ``D``.

    Parameters
    ----------
    x : 1-D array-like, feature vector of the query instance.
    D : 2-D array, one training instance per row.
    K : int, number of neighbors to return.
    measure : int, 0 for Euclidean distance, 1 for Cosine distance
        (1 - cosine similarity).

    Returns
    -------
    (idx, dists) : indexes of the K nearest rows of D (closest first) and
        the full distance vector to every row of D.

    Raises
    ------
    ValueError : if measure is neither 0 nor 1.  (The original fell through
        with `dists` undefined, producing a confusing NameError.)
    """
    if measure == 0:
        # Euclidean distances from every training instance to x.
        dists = np.sqrt(((D - x) ** 2).sum(axis=1))
    elif measure == 1:
        # Row-wise norms of D and the norm of x, computed in one call each.
        D_norm = np.linalg.norm(D, axis=1)
        x_norm = np.linalg.norm(x)
        # Cosine similarity of x with each row; distance is its complement.
        sims = np.dot(D, x) / (D_norm * x_norm)
        dists = 1 - sims
    else:
        raise ValueError("measure must be 0 (Euclidean) or 1 (Cosine)")
    idx = np.argsort(dists)  # ascending distance => nearest first
    # return the indexes of K nearest neighbors
    return idx[:K], dists
# Sanity check: 5 nearest Euclidean neighbors of the first test instance.
neigh_idx, distances = knn_search(df_test_arr[0], df_train_arr, 5, 0)
print(neigh_idx)
print("\nNearest Neigbors:")
df_train.iloc[neigh_idx]
# printing distances of top 5 nearest neighbors
print(distances[neigh_idx])
# Class labels of those neighbors (used for the majority vote below).
neigh_labels = df_target_train_arr[neigh_idx]
print(neigh_labels)
Top 5 nearest neighbors are: High, Average, High, Average, Average
from collections import Counter
# Majority vote: the most common label among the K neighbors wins.
print(Counter(neigh_labels))
Counter(neigh_labels).most_common(1)
Since the majority vote among the top 5 nearest neighbours is Average, the predicted unemployment-rate value falls under the Average category.
# A fresh 80/20 split (different seed) for the sklearn KNN baseline.
X_train, X_test, y_train, y_test = train_test_split(df_scaled, df_target, test_size=0.2, random_state=50)
from sklearn.neighbors import KNeighborsClassifier
# Fit a 5-nearest-neighbor classifier on the scaled features.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
# Performing prediction
pred = knn.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
# Sweep k from 1 to 39 and record the misclassification rate for each value.
error_rate = []
for i in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    error_rate.append(np.mean(pred_i != y_test))

import matplotlib.pyplot as plt

# Plot the error curve so the best k can be read off visually.
plt.figure(figsize=(10, 6))
plt.plot(range(1, 40), error_rate, color='blue', linestyle='dashed',
         marker='o', markerfacecolor='red', markersize=10)
plt.title('Error Rate vs K value')
plt.xlabel('K')
plt.ylabel('Error Rate')
From the Error vs K plot, the best value of k is 3. So, performing prediction with 3 neighbors
# Refit with the k chosen from the error plot and evaluate on the test split.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
pred_knn = knn.predict(X_test)
# BUG FIX: the original printed metrics for `pred` (the earlier k=5
# predictions) instead of `pred_knn`, so the k=3 model was never evaluated.
print(confusion_matrix(y_test, pred_knn))
print(classification_report(y_test, pred_knn))
Accuracy using knn classifier is 72%
2) USING TERM DOCUMENT CATEGORIZATION
df_train.shape
# NOTE(review): rows are treated as "terms" and columns as "documents"
# (numTerms = n_rows, NDocs = n_cols).  For a tabular county dataset this
# orientation looks transposed — confirm it is intentional.
numTerms=df_train.shape[0]
NDocs = df_train.shape[1]
# Row-wise totals, analogous to term frequencies across all documents.
termFreqs = df_train.sum(axis=1)
termFreqs.head(5)
# Rank/frequency plot of the row totals.
plt.plot(sorted(termFreqs, reverse=True))
plt.show()
# "Document frequency": per row, the count of non-zero entries.
DF = pd.DataFrame([(df_train!=0).sum(1)]).T
DF.head(5)
# Constant matrix of NDocs, same shape as df_train, for the IDF ratio.
NMatrix=np.ones(np.shape(df_train), dtype=float)*NDocs
np.set_printoptions(precision=2,suppress=True,linewidth=120)
print(NMatrix)
# Convert each entry into IDF values
# IDF is the log of the inverse of document frequency
# Note that IDF is only a function of the term, so all columns will be identical.
IDF = np.log2(np.divide(NMatrix, np.array(DF)))
IDF
# Element-wise TF x IDF weighting of the training matrix.
TD_tfidf = df_train * IDF
TD_tfidf.head(10)
IDF.T[0].shape
Converting the test data using TD*IDF
# Apply the same TF-IDF construction to the test split.
# NOTE(review): IDF2 is recomputed from the *test* data rather than reusing
# the training IDF — in a strict train/test protocol the transform should be
# derived from the training set only; verify this is intended.
numTerms2=df_test.shape[0]
NDocs2 = df_test.shape[1]
# Per-row count of non-zero entries ("document frequency") for the test set.
DF2 = pd.DataFrame([(df_test!=0).sum(1)]).T
DF2.head(10)
NMatrix2=np.ones(np.shape(df_test), dtype=float)*NDocs2
np.set_printoptions(precision=2,suppress=True,linewidth=120)
print(NMatrix2)
IDF2 = np.log2(np.divide(NMatrix2, np.array(DF2)))
IDF2
# Element-wise TF x IDF weighting of the test matrix.
df2_tfidf = df_test * IDF2
df2_tfidf.head(5)
# Tune k for the TF-IDF representation.
# BUG FIX: the original loop fit on the raw df_train/df_test even though the
# final model below is trained on TD_tfidf and evaluated on df2_tfidf, so k
# was tuned on the wrong features; tune on the same TF-IDF matrices instead.
error_rate = []
for i in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(TD_tfidf, df_target_train)
    pred_i = knn.predict(df2_tfidf)
    error_rate.append(np.mean(pred_i != df_target_test))

plt.figure(figsize=(10, 6))
plt.plot(range(1, 40), error_rate, color='blue', linestyle='dashed',
         marker='o', markerfacecolor='red', markersize=10)
plt.title('Error Rate vs n_neighbors')
plt.xlabel('n_neighbors')
plt.ylabel('Error Rate')
# From the above graph the best k value is 10
# Final TF-IDF model: 10-NN trained on the weighted training matrix and
# evaluated on the weighted test matrix.
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(TD_tfidf, df_target_train)
pred = knn.predict(df2_tfidf)
print(confusion_matrix(df_target_test, pred))
print(classification_report(df_target_test, pred))
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(df_target_test, pred))
From the above classification reports, the weighted-average F1 using TD*IDF is 0.61 versus 0.73 for plain KNN. So the KNN model without TD*IDF is the better choice in this case.
Predicting the Unemployment_rate for a random query
import random

# Build a random query instance: one value in [0, 1) per feature (28 total).
x = [random.random() for _ in range(1, 29)]

# Each term of the query must be weighted by the IDF of that term —
# a coordinate-wise product with the first row of the IDF matrix.
x_tfidf = x * IDF[0]
print(x_tfidf)
x_tfidf.shape

# Training TF-IDF matrix as a plain array for the hand-rolled kNN search.
DT_tfidf = TD_tfidf
DT_array = np.array(DT_tfidf)
DT_array.shape
df.head(5)
DT_array
# 5 nearest Euclidean neighbors of the TF-IDF-weighted query.
neigh_idx, distances = knn_search(x_tfidf, DT_array, 5, 0)
# Distances between query objects and training objects
distances
# Re-index the distances by the training DataFrame's index for readability.
distances = pd.Series(distances, index=DT_tfidf.index)
print("Query:", x)
print("\nNeighbors:")
DT_tfidf.iloc[neigh_idx]
df_target.shape
# Align the training labels with the TF-IDF matrix's index.
cat_labels = np.array(df_target_train)
cat_labels = pd.Series(cat_labels, index=DT_tfidf.index)
DT_tfidf["Category"] = cat_labels
def knn_classify(x, D, K, labels, measure):
    """Classify instance ``x`` by majority vote among its K nearest neighbors.

    Delegates the neighbor search to ``knn_search``; ``labels`` holds the
    class of each row of ``D`` and ``measure`` selects the distance
    (0 = Euclidean, 1 = Cosine).  Returns the winning label.
    """
    from collections import Counter

    nearest, _ = knn_search(x, D, K, measure)
    vote = Counter(labels[nearest])
    print("Labels for top ", K, "neighbors: ", vote)
    # most_common(1) -> [(label, count)]; return just the winning label.
    return vote.most_common(1)[0][0]
# Classify the random query with the hand-rolled 5-NN majority vote.
print("Instance to classify:\n", x)
print("Predicted Category for the new instance: ", knn_classify(x_tfidf, DT_array, 5, cat_labels, 0))
Predicted category for the new instance using TD*IDF is "High"
3) USING DECISION TREE CLASSIFIER
from sklearn.tree import DecisionTreeClassifier
# Baseline decision tree with default hyperparameters (gini criterion).
dtree = DecisionTreeClassifier()
dtree.fit(df_train,df_target_train)
predictions = dtree.predict(df_test)
# Test accuracy first, then training accuracy (to gauge overfitting).
print(dtree.score(df_test, df_target_test))
print(dtree.score(df_train, df_target_train))
The tree performed much better on the training set than on the test set. A training score of 1 means the tree has effectively memorized the training data, so the decision tree model is overfitted (too complex), not underfitted.
# Per-class precision/recall/F1 for the default tree on the test split.
print(classification_report(df_target_test,predictions))
Accuracy using decision tree is 72%
Setting criterion to entropy
# Same tree, but splitting on information gain instead of Gini impurity.
dtree = DecisionTreeClassifier(criterion="entropy")
dtree.fit(df_train, df_target_train)
predictions = dtree.predict(df_test)
print(classification_report(df_target_test, predictions))
After setting criterion as "entropy", I got accuracy as 76%. Which is better than "gini" criterion
Changing default values in decision tree classifier
# Sweep max_depth 1..39 for the entropy tree and record the test error rate.
error_rate = []
for i in range(1, 40):
    dtree = DecisionTreeClassifier(criterion="entropy", max_depth=i)
    dtree.fit(df_train, df_target_train)
    predictions = dtree.predict(df_test)
    error_rate.append(np.mean(predictions != df_target_test))

# Plot the error curve so the best depth can be read off visually.
plt.figure(figsize=(10, 6))
plt.plot(range(1, 40), error_rate, color='blue', linestyle='dashed',
         marker='o', markerfacecolor='red', markersize=10)
plt.title('Error Rate vs max_depth')
plt.xlabel('max_depth')
plt.ylabel('Error Rate')
# from the above plot, best value of max_depth =13
# Refit the entropy tree at the chosen depth and evaluate on the test split.
dtree = DecisionTreeClassifier(criterion = "entropy",max_depth = 13)
# fit() returns the estimator itself, so t aliases the fitted tree.
t = dtree.fit(df_train,df_target_train)
predictions = dtree.predict(df_test)
print(classification_report(df_target_test,predictions))
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(df_target_test, predictions))
So, by using decision tree classifier the best accuracy achieved is 78%
from IPython.display import Image
# BUG FIX: sklearn.externals.six was removed in scikit-learn 0.23+;
# StringIO lives in the standard library's io module.
from io import StringIO
from sklearn.tree import export_graphviz
import pydot
# Feature names for labelling the exported tree's split nodes.
features = list(df.columns[0:])
features
Printing and exporting decision tree
# Render the fitted decision tree to a PNG via graphviz dot format.
dot_data = StringIO()
export_graphviz(dtree, out_file=dot_data,feature_names=features,filled=True,rounded=True)
# graph_from_dot_data returns a list of graphs; take the first.
graph = pydot.graph_from_dot_data(dot_data.getvalue())
Image(graph[0].create_png())
4)USING NAIVE BAYES (GAUSSIAN) CLASSIFIER
from sklearn import naive_bayes
# Gaussian Naive Bayes: assumes each feature is normally distributed per class.
nb_model = naive_bayes.GaussianNB()
nb_model = nb_model.fit(df_train, df_target_train)
nb_pred = nb_model.predict(df_test)
nb_pred[1:20]
# Test accuracy first, then training accuracy.
print(nb_model.score(df_test, df_target_test))
print(nb_model.score(df_train, df_target_train))
confusion_mat = confusion_matrix(df_target_test, nb_pred)
print(confusion_mat)
print(classification_report(df_target_test, nb_pred))
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(df_target_test, nb_pred))
Accuracy using Naive Bayes is 54.4%
5) USING RANDOM FOREST CLASSIFIER
from sklearn.ensemble import RandomForestClassifier
# 100-tree forest with entropy splits; random_state=None means results are
# not reproducible between runs.
rfc = RandomForestClassifier(n_estimators=100,criterion='entropy', random_state=None)
rfc.fit(df_train, df_target_train)
rfc_pred = rfc.predict(df_test)
print(confusion_matrix(df_target_test,rfc_pred))
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(df_target_test, rfc_pred))
By using default values for RandomForestClassifier we got 78.69% accuracy
# Sweep the forest's max_depth from 1 to 39 and record the test error rate.
# NOTE(review): with random_state=None each run is non-deterministic, so the
# curve can change between executions.
error_rate = []
for i in range(1, 40):
    rfc = RandomForestClassifier(n_estimators=100, criterion='entropy',
                                 random_state=None, max_depth=i)
    rfc.fit(df_train, df_target_train)
    rfc_pred = rfc.predict(df_test)
    error_rate.append(np.mean(rfc_pred != df_target_test))

plt.figure(figsize=(10, 6))
plt.plot(range(1, 40), error_rate, color='blue', linestyle='dashed',
         marker='o', markerfacecolor='red', markersize=10)
plt.title('Error Rate vs max_depth')
plt.xlabel('max_depth')
plt.ylabel('Error Rate')
# best value of max_depth is 40
# NOTE(review): the sweep above only covered depths 1..39, so max_depth=40
# was never actually evaluated — confirm the chosen value against the plot.
rfc = RandomForestClassifier(n_estimators=100,criterion='entropy', random_state=None,max_depth=40)
rfc.fit(df_train, df_target_train)
rfc_pred = rfc.predict(df_test)
print(confusion_matrix(df_target_test,rfc_pred))
print("Accuracy:",metrics.accuracy_score(df_target_test, rfc_pred))
By setting n_estimators to 100,criterion to 'entropy', random_state to None and max_depth to 40, the maximum accuracy achieved is 80.7%
Of all of the above classification techniques, Random Forest Classifier has maximum accuracy. So, let's perform prediction using random forest